##Reading Data

options(max.print=999999999)
options(warn=-1)
library(caret)
library(readstata13)
library(TSstudio)
library(caret)
library(ROCR)
library(pROC)
library(xgboost)
library(gbm)
library(dplyr)
library(gtsummary)
library(xlsx)
library(rJava)
library(ggplot2)
library(segmented)
library(rpart)
library(tidyverse)
library(modelr)
library(dotwhisker)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(broom)
library(lme4)
library(MuMIn)
library(AICcmodavg)
library(psycho)
library(performance)
library(pbkrtest)
library(estimability)
library(emmeans)
library(lsmeans)
library(ggiraph)
library(ggiraphExtra)
require(plyr)



data2<-read.dta13(file = "Y:/Haroon Janjua/Project_Lap_Open_Rob_FL AHCA_Charges/Data_iHernia/Hernia_18_Robotic_Inflation.dta")

# looking at the dimensions of the dataset
dim(data2)
## [1] 8604   24
# looking at any missing values in our dataset
sum(is.na(data2))
## [1] 0
# taking a look at the structure of the varaibles in the dataset
str(data2[1:24])
## 'data.frame':    8604 obs. of  24 variables:
##  $ SYS_RECID          : num  62533124 62612463 62532927 65090208 63313825 ...
##  $ MCARE_NBR          : chr  "100006" "100006" "100006" "100006" ...
##  $ Tot_Open           : num  202 202 202 202 202 202 202 202 202 202 ...
##  $ Tot_Lap            : num  66 66 66 66 66 66 66 66 66 66 ...
##  $ Tot_Rob            : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ Tot_Hernia         : num  279 279 279 279 279 279 279 279 279 279 ...
##  $ TCHGS              : num  46136 26501 25429 42431 34879 ...
##  $ OPRMCHGS           : num  17676 13616 10716 18775 10281 ...
##  $ Non_OPR_CHGS       : num  28460 12885 14713 23656 24598 ...
##  $ COST               : num  7213 4143 3976 6634 5453 ...
##  $ Cost_OPR           : num  2764 2129 1675 2935 1607 ...
##  $ COST_Real          : num  7663 4402 4224 7048 5793 ...
##  $ COST_OPR_Real      : num  2936 2262 1780 3118 1708 ...
##  $ Non_OPR_Cost       : num  4450 2015 2300 3699 3846 ...
##  $ CCRATIO            : num  0.156 0.156 0.156 0.156 0.156 ...
##  $ OPR_PC             : num  0.383 0.514 0.421 0.442 0.295 ...
##  $ OPR_PC_CHG         : num  0.383 0.514 0.421 0.442 0.295 ...
##  $ TOTAL_HOSPITAL_BEDS: int  1454 1454 1454 1454 1454 1454 1454 1454 1454 1454 ...
##  $ VISITS             : int  1222 4755 790 1373 778 4987 4755 1362 778 1373 ...
##  $ TIME               : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ typeofproc2        : chr  "OPEN" "OPEN" "OPEN" "OPEN" ...
##  $ typeofproc         : num  1 1 1 1 1 2 1 1 1 1 ...
##  $ YEAR               : num  2017 2017 2017 2017 2017 ...
##  $ QTR                : int  1 1 1 4 2 2 1 2 2 4 ...
data2[c(3:19)] <- lapply(data2[,c(3:19)] , as.numeric)
data2$typeofproc2 <- as.factor(data2$typeofproc2)
data2$typeofproc <- as.factor(data2$typeofproc)

# taking a look at the structure of the varaibles in the dataset
str(data2[1:24])
## 'data.frame':    8604 obs. of  24 variables:
##  $ SYS_RECID          : num  62533124 62612463 62532927 65090208 63313825 ...
##  $ MCARE_NBR          : chr  "100006" "100006" "100006" "100006" ...
##  $ Tot_Open           : num  202 202 202 202 202 202 202 202 202 202 ...
##  $ Tot_Lap            : num  66 66 66 66 66 66 66 66 66 66 ...
##  $ Tot_Rob            : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ Tot_Hernia         : num  279 279 279 279 279 279 279 279 279 279 ...
##  $ TCHGS              : num  46136 26501 25429 42431 34879 ...
##  $ OPRMCHGS           : num  17676 13616 10716 18775 10281 ...
##  $ Non_OPR_CHGS       : num  28460 12885 14713 23656 24598 ...
##  $ COST               : num  7213 4143 3976 6634 5453 ...
##  $ Cost_OPR           : num  2764 2129 1675 2935 1607 ...
##  $ COST_Real          : num  7663 4402 4224 7048 5793 ...
##  $ COST_OPR_Real      : num  2936 2262 1780 3118 1708 ...
##  $ Non_OPR_Cost       : num  4450 2015 2300 3699 3846 ...
##  $ CCRATIO            : num  0.156 0.156 0.156 0.156 0.156 ...
##  $ OPR_PC             : num  0.383 0.514 0.421 0.442 0.295 ...
##  $ OPR_PC_CHG         : num  0.383 0.514 0.421 0.442 0.295 ...
##  $ TOTAL_HOSPITAL_BEDS: num  1454 1454 1454 1454 1454 ...
##  $ VISITS             : num  1222 4755 790 1373 778 ...
##  $ TIME               : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ typeofproc2        : Factor w/ 3 levels "LAP","OPEN","ROBOTIC": 2 2 2 2 2 1 2 2 2 2 ...
##  $ typeofproc         : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 2 1 1 1 1 ...
##  $ YEAR               : num  2017 2017 2017 2017 2017 ...
##  $ QTR                : int  1 1 1 4 2 2 1 2 2 4 ...

#Linear Regression Models

# fit LM for Open 
set.seed(5842)
model.open <- lm(COST_OPR_Real ~ TIME , data = data2, subset = typeofproc2 == "OPEN")
summary(model.open)
## 
## Call:
## lm(formula = COST_OPR_Real ~ TIME, data = data2, subset = typeofproc2 == 
##     "OPEN")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1726.0  -599.7  -183.6   404.2 10188.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1975.91      32.56  60.679  < 2e-16 ***
## TIME           66.89       9.34   7.162 9.48e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 868.7 on 3913 degrees of freedom
## Multiple R-squared:  0.01294,    Adjusted R-squared:  0.01269 
## F-statistic: 51.29 on 1 and 3913 DF,  p-value: 9.478e-13
# fit LM for Lap
set.seed(5842)
model.lap <- lm(COST_OPR_Real ~ TIME , data = data2, subset = typeofproc2 == "LAP")
summary(model.lap)
## 
## Call:
## lm(formula = COST_OPR_Real ~ TIME, data = data2, subset = typeofproc2 == 
##     "LAP")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2513.7  -954.1  -229.4   451.9 11775.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2447.07      76.70  31.905  < 2e-16 ***
## TIME          135.41      19.87   6.813  1.3e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1346 on 1784 degrees of freedom
## Multiple R-squared:  0.02536,    Adjusted R-squared:  0.02481 
## F-statistic: 46.42 on 1 and 1784 DF,  p-value: 1.303e-11
# fit LM for Rob
set.seed(5842)
model.rob <- lm(COST_OPR_Real ~ TIME , data = data2, subset = typeofproc2 == "ROBOTIC")
summary(model.rob)
## 
## Call:
## lm(formula = COST_OPR_Real ~ TIME, data = data2, subset = typeofproc2 == 
##     "ROBOTIC")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3609.9 -1283.8  -429.8  1115.8 12683.5 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3469.76     115.97   29.92   <2e-16 ***
## TIME          332.29      27.65   12.02   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1923 on 2901 degrees of freedom
## Multiple R-squared:  0.04742,    Adjusted R-squared:  0.04709 
## F-statistic: 144.4 on 1 and 2901 DF,  p-value: < 2.2e-16
model.open$coefficients
## (Intercept)        TIME 
##  1975.91187    66.89208
model.lap$coefficients
## (Intercept)        TIME 
##   2447.0712    135.4059
model.rob$coefficients
## (Intercept)        TIME 
##   3469.7595    332.2948
#Analysis of Variance
set.seed(5842)
m.interaction <- lm(COST_OPR_Real ~ TIME*typeofproc2, data = data2)
anova(m.interaction)
## Analysis of Variance Table
## 
## Response: COST_OPR_Real
##                    Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## TIME                1 2.1602e+09 2160171078 1098.450 < 2.2e-16 ***
## typeofproc2         2 9.8216e+09 4910779486 2497.139 < 2.2e-16 ***
## TIME:typeofproc2    2 2.2089e+08  110443997   56.161 < 2.2e-16 ***
## Residuals        8598 1.6909e+10    1966562                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m.lst <- lstrends(m.interaction, "typeofproc2", var="TIME") ; m.lst
##  typeofproc2 TIME.trend   SE   df lower.CL upper.CL
##  LAP              135.4 20.7 8598     94.8    176.0
##  OPEN              66.9 15.1 8598     37.3     96.4
##  ROBOTIC          332.3 20.2 8598    292.8    371.8
## 
## Confidence level used: 0.95
pairs(m.lst)
##  contrast       estimate   SE   df t.ratio p.value
##  LAP - OPEN         68.5 25.6 8598   2.675  0.0205
##  LAP - ROBOTIC    -196.9 28.9 8598  -6.811  <.0001
##  OPEN - ROBOTIC   -265.4 25.2 8598 -10.540  <.0001
## 
## P value adjustment: tukey method for comparing a family of 3 estimates

#Interactive Linear Model Plots

ggPredict(model.open,se=TRUE,interactive=TRUE)
ggPredict(model.lap,se=TRUE,interactive=TRUE)
ggPredict(model.rob,se=TRUE,interactive=TRUE)

#Non Interactive Combined Linear Model Plots

data2 %>%
  ggplot(aes(x=TIME, 
             y=COST_OPR_Real, 
             color=typeofproc2))+
  geom_point()+
  geom_smooth(method="lm")

data2 %>%
  ggplot(aes(x=TIME, 
             y=COST_OPR_Real, 
             color=typeofproc2))+
    geom_smooth(method="lm")

#Year by Year Linear Reg Models

data3<-read.dta13(file = "Y:/Haroon Janjua/Project_Lap_Open_Rob_FL AHCA_Charges/Data_iHernia/Hernia_18_Robotic_YEARLY_Inflation.dta")

# looking at the dimensions of the dataset
dim(data3)
## [1] 11266    15
# looking at any missing values in our dataset
sum(is.na(data3))
## [1] 45064
# taking a look at the structure of the varaibles in the dataset
str(data3[1:15])
## 'data.frame':    11266 obs. of  15 variables:
##  $ SYS_RECID    : num  65308574 64540880 62533219 64539131 63274186 ...
##  $ COST         : num  13124 10914 13713 7286 9474 ...
##  $ Cost_OPR     : num  7312 6531 7810 2809 4804 ...
##  $ COST_Real    : num  13942 11595 14569 7740 10065 ...
##  $ COST_OPR_Real: num  7768 6939 8297 2984 5103 ...
##  $ Non_OPR_Cost : num  5812 4383 5904 4477 4670 ...
##  $ TIME         : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ typeofproc2  : chr  "ROBOTIC" "ROBOTIC" "ROBOTIC" "ROBOTIC" ...
##  $ typeofproc   : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ YEAR         : num  2017 2017 2017 2017 2017 ...
##  $ typeofproc3  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ typeofproc4  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ typeofproc5  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ typeofproc6  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ typeproc2    : chr  "ROB_15-20" "ROB_15-20" "ROB_15-20" "ROB_15-20" ...
data3[c(2:6)] <- lapply(data3[,c(2:6)] , as.numeric)
data3$typeofproc2 <- as.factor(data3$typeofproc2)
data3$typeofproc <- as.factor(data3$typeofproc)
data3$typeproc2 <- as.factor(data3$typeproc2)

# taking a look at the structure of the varaibles in the dataset
str(data3[1:15])
## 'data.frame':    11266 obs. of  15 variables:
##  $ SYS_RECID    : num  65308574 64540880 62533219 64539131 63274186 ...
##  $ COST         : num  13124 10914 13713 7286 9474 ...
##  $ Cost_OPR     : num  7312 6531 7810 2809 4804 ...
##  $ COST_Real    : num  13942 11595 14569 7740 10065 ...
##  $ COST_OPR_Real: num  7768 6939 8297 2984 5103 ...
##  $ Non_OPR_Cost : num  5812 4383 5904 4477 4670 ...
##  $ TIME         : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ typeofproc2  : Factor w/ 2 levels "","ROBOTIC": 2 2 2 2 2 2 2 2 2 2 ...
##  $ typeofproc   : Factor w/ 1 level "3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ YEAR         : num  2017 2017 2017 2017 2017 ...
##  $ typeofproc3  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ typeofproc4  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ typeofproc5  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ typeofproc6  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ typeproc2    : Factor w/ 5 levels "ROB_15-20","ROB_16-20",..: 1 1 1 1 1 1 1 1 1 1 ...
# fit LM for Rob 2015-2020
set.seed(5877)
model.rob <- lm(COST_OPR_Real ~ TIME , data = data3, subset = typeproc2 == "ROB_15-20")
summary(model.rob)
## 
## Call:
## lm(formula = COST_OPR_Real ~ TIME, data = data3, subset = typeproc2 == 
##     "ROB_15-20")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3609.9 -1283.8  -429.8  1115.8 12683.5 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3469.76     115.97   29.92   <2e-16 ***
## TIME          332.29      27.65   12.02   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1923 on 2901 degrees of freedom
## Multiple R-squared:  0.04742,    Adjusted R-squared:  0.04709 
## F-statistic: 144.4 on 1 and 2901 DF,  p-value: < 2.2e-16
# fit LM for Rob 2016-2020
set.seed(5877)
model.rob16 <- lm(COST_OPR_Real ~ TIME , data = data3, subset = typeproc2 == "ROB_16-20")
summary(model.rob16)
## 
## Call:
## lm(formula = COST_OPR_Real ~ TIME, data = data3, subset = typeproc2 == 
##     "ROB_16-20")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3637.8 -1305.2  -453.2  1103.8  9762.5 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3189.66     123.33   25.86   <2e-16 ***
## TIME          393.89      29.09   13.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1892 on 2835 degrees of freedom
## Multiple R-squared:  0.06076,    Adjusted R-squared:  0.06042 
## F-statistic: 183.4 on 1 and 2835 DF,  p-value: < 2.2e-16
# fit LM for Rob 2017-2020
set.seed(5877)
model.rob17 <- lm(COST_OPR_Real ~ TIME ,data = data3, subset = typeproc2 == "ROB_17-20")
summary(model.rob17)
## 
## Call:
## lm(formula = COST_OPR_Real ~ TIME, data = data3, subset = typeproc2 == 
##     "ROB_17-20")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3680.9 -1391.3  -416.4  1106.2  9661.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2793.15     174.13   16.04   <2e-16 ***
## TIME          476.77      38.89   12.26   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1924 on 2469 degrees of freedom
## Multiple R-squared:  0.05738,    Adjusted R-squared:  0.05699 
## F-statistic: 150.3 on 1 and 2469 DF,  p-value: < 2.2e-16
# fit LM for Rob 2018-2020
set.seed(5877)
model.rob18 <- lm(COST_OPR_Real ~ TIME , data = data3, subset = typeproc2 == "ROB_18-20")
summary(model.rob18)
## 
## Call:
## lm(formula = COST_OPR_Real ~ TIME, data = data3, subset = typeproc2 == 
##     "ROB_18-20")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3682.5 -1437.8  -375.7  1116.5  9660.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2783.75     322.39   8.635  < 2e-16 ***
## TIME          478.62      66.18   7.232 6.92e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1997 on 1851 degrees of freedom
## Multiple R-squared:  0.02748,    Adjusted R-squared:  0.02696 
## F-statistic: 52.31 on 1 and 1851 DF,  p-value: 6.923e-13
# fit LM for Rob 2019-2020
set.seed(5877)
model.rob19 <- lm(COST_OPR_Real ~ TIME ,data = data3, subset = typeproc2 == "ROB_19-20")
summary(model.rob19)
## 
## Call:
## lm(formula = COST_OPR_Real ~ TIME, data = data3, subset = typeproc2 == 
##     "ROB_19-20")
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4132  -1374   -255    945   9211 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1411.7      670.9  -2.104   0.0356 *  
## TIME          1252.8      127.0   9.867   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1944 on 1200 degrees of freedom
## Multiple R-squared:  0.07505,    Adjusted R-squared:  0.07427 
## F-statistic: 97.36 on 1 and 1200 DF,  p-value: < 2.2e-16
model.rob$coefficients
## (Intercept)        TIME 
##   3469.7595    332.2948
model.rob16$coefficients
## (Intercept)        TIME 
##   3189.6572    393.8887
model.rob17$coefficients
## (Intercept)        TIME 
##   2793.1545    476.7681
model.rob18$coefficients
## (Intercept)        TIME 
##   2783.7495    478.6156
model.rob19$coefficients
## (Intercept)        TIME 
##   -1411.703    1252.761
#Analysis of Variance
m.interaction2 <- lm(COST_OPR_Real ~ TIME*typeproc2, data = data3)
anova(m.interaction2)
## Analysis of Variance Table
## 
## Response: COST_OPR_Real
##                   Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## TIME               1 2.3282e+09 2328190528 625.1016 < 2.2e-16 ***
## typeproc2          4 1.5788e+07    3946985   1.0597    0.3747    
## TIME:typeproc2     4 2.1189e+08   52973147  14.2229 1.394e-11 ***
## Residuals      11256 4.1923e+10    3724499                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m.interaction2$coefficients
##             (Intercept)                    TIME      typeproc2ROB_16-20 
##              3469.75952               332.29484              -280.10227 
##      typeproc2ROB_17-20      typeproc2ROB_18-20      typeproc2ROB_19-20 
##              -676.60497              -686.01007             -4881.46255 
## TIME:typeproc2ROB_16-20 TIME:typeproc2ROB_17-20 TIME:typeproc2ROB_18-20 
##                61.59384               144.47330               146.32080 
## TIME:typeproc2ROB_19-20 
##               920.46653
m.lst2 <- lstrends(m.interaction2, "typeproc2", var="TIME") ; m.lst2
##  typeproc2 TIME.trend    SE    df lower.CL upper.CL
##  ROB_15-20        332  27.8 11256      278      387
##  ROB_16-20        394  29.7 11256      336      452
##  ROB_17-20        477  39.0 11256      400      553
##  ROB_18-20        479  64.0 11256      353      604
##  ROB_19-20       1253 126.1 11256     1006     1500
## 
## Confidence level used: 0.95
pairs(m.lst2)
##  contrast                  estimate    SE    df t.ratio p.value
##  (ROB_15-20) - (ROB_16-20)   -61.59  40.6 11256  -1.516  0.5520
##  (ROB_15-20) - (ROB_17-20)  -144.47  47.9 11256  -3.017  0.0215
##  (ROB_15-20) - (ROB_18-20)  -146.32  69.7 11256  -2.099  0.2206
##  (ROB_15-20) - (ROB_19-20)  -920.47 129.1 11256  -7.130  <.0001
##  (ROB_16-20) - (ROB_17-20)   -82.88  49.0 11256  -1.691  0.4396
##  (ROB_16-20) - (ROB_18-20)   -84.73  70.5 11256  -1.202  0.7504
##  (ROB_16-20) - (ROB_19-20)  -858.87 129.5 11256  -6.631  <.0001
##  (ROB_17-20) - (ROB_18-20)    -1.85  74.9 11256  -0.025  1.0000
##  (ROB_17-20) - (ROB_19-20)  -775.99 132.0 11256  -5.880  <.0001
##  (ROB_18-20) - (ROB_19-20)  -774.15 141.4 11256  -5.476  <.0001
## 
## P value adjustment: tukey method for comparing a family of 5 estimates
set.seed(5877)
data3 %>%
  ggplot(aes(x=TIME, 
             y=COST_OPR_Real, 
             color=typeproc2))+
  geom_point()+
  geom_smooth(method="lm")

set.seed(5877)
data3 %>%
  ggplot(aes(x=TIME, 
             y=COST_OPR_Real, 
             color=typeproc2))+
  geom_smooth(method="lm")